home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
C/C++ Users Group Library 1996 July
/
C-C++ Users Group Library July 1996.iso
/
listings
/
v_11_08
/
1108036b
< prev
next >
Wrap
Text File
|
1993-05-03
|
23KB
|
582 lines
/***************************************************************
* file: ZONE.C
* purpose: text region detection via cellular automata
**************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "zone.h"
/* local data */
/* Table of zones, fixed at MAX_ZONES entries; zone() rescales its
 * entries to full-page coordinates before returning. */
static ZONE zone_list[MAX_ZONES]; /* this could be dynamically allocated */
/* Upper bound of the zone_list entries that zone() rescales; presumably
 * the number of entries currently in use (it is set outside zone()). */
static int zone_count;
/* local prototypes */
/* build the reduced, one-byte-per-sample copy of the page */
static unsigned char *sample_page(int *dx,int *dy,int *samplex,int *sampley);
#if CUT_VERTICAL_LINES
/* Guarded like its definition and its call in zone(), so that a build
 * without CUT_VERTICAL_LINES does not declare a static function that
 * is never defined. */
static void cut_vertical_lines(unsigned char *image,int dx,int dy);
#endif
/* block out zones in the sampled image */
static void block_zones(unsigned char *image,int dx,int dy,int coarseness);
/* sequence the zones in the requested order */
static void sequence_zones(unsigned char *image,int dx,int dy,int order);
static int extract_zone(unsigned char *image,int dx,int dy,int x,int y,ZONE *zone_ptr);
static void overlap_zones(ZONE *zone_array,int *array_size);
/************************************************
* function: int zone(int coarseness,int order)
* Detect the text zones of the page. On success the entries
* zone_list[0] .. zone_list[zone_count-1] have been translated from
* sample coordinates to full page coordinates.
* The process steps are:
* 1) Sample the page
* 2) Cut vertical lines from the page (only if CUT_VERTICAL_LINES)
* 3) Block out zones via cellular automata
* 4) Extract the zones
* 5) Sequence zones
* parameters: coarseness value 0-5, order (COLUMN or ROW)
* returns: 1 if OK or 0 if error, see errno:
*          EINVAL  coarseness is outside 0-5
*          ENOMEM  the sampled page could not be allocated
************************************************/
int zone(int coarseness,int order)
{
    unsigned char *image;       /* sampled copy of the page */
    int dx,dy;                  /* extents of the sampled image */
    int samplex,sampley;        /* sampling distance in x and y */
    int i;

    if (coarseness < 0 || coarseness > 5)
    {   /* test coarseness parameter */
        errno = EINVAL;
        return 0;
    }

    /* get a scaled copy of the page */
    image = sample_page(&dx,&dy,&samplex,&sampley);
    if (image == NULL)
    {   /* sample_page() returns NULL only when its malloc() fails.
         * ISO C does not require malloc() to set errno, so set it
         * here to honor the "see errno" contract above. */
        errno = ENOMEM;
        return 0;
    }

#if CUT_VERTICAL_LINES
    cut_vertical_lines(image,dx,dy);     /* remove boxes around text */
#endif
    block_zones(image,dx,dy,coarseness); /* block out zones */
    sequence_zones(image,dx,dy,order);

    for (i = 0 ; i < zone_count ; i++)
    {   /* translate to full page */
        zone_list[i].x1 *= samplex;
        zone_list[i].y1 *= sampley;
        zone_list[i].x2 *= samplex;
        zone_list[i].y2 *= sampley;
    }

    free(image);                /* clean up */
    return 1;
}
/**************** LOCAL FUNCTIONS ****************/
/************************************************
* function: static unsigned char *sample_page(int *dx,int *dy,int *samplex,int *sampley)
* Sample the page. Normally, the entire page is stored in memory. Since
* the memory requirements are typically a megabyte, the page, in DOS
* machines, is somewhere in extended memory. So that this demo can
* work on machines lacking extended memory, I sampled the page when I
* scaled it for display. See display_sample() below. However, I
* have #ifed around the code that is normally used to sample the page
* from the memory image. You need to provide two functions, as well as
* the extended memory handler. They are
*     void memory_info(DISPLAY_BOX *);
* which gets the x/y extents of the image, and
*     unsigned char *memory_ptr(int y);
* which returns a pointer to a scan line in the memory image.
* Sample_page() creates a standardized, reduced image suitable for
* the cellular automata. The sample has each byte, not bit, representing
* a pixel area. The byte is BLACK if any sampled pixel is black, or
* WHITE otherwise.
* The sampling procedure is important for region finding. If possible
* it should be a function of the density of the original image. If
* the image isn't square, for example a 200x100 fax file, then the
* x and y sampling should be adjusted accordingly. Since I don't
* have dpi information here, I am scaling it to a typical, 200dpi,
* value after it was adjusted for screen display.
* If the sample would exceed MAX_MALLOC bytes, both sampling distances
* are increased until it fits.
* Note: Bit 7 is leftmost and 0 is rightmost; 1 bits are black, 0 are white.
* parameters: pointers to storage for the byte width and height of the sampled
* image and the sample bit distance in x and y
* returns: pointer to sampled page (caller must free() it) or NULL if no memory
************************************************/
static unsigned char *sample_page(int *dx,int *dy,int *samplex,int *sampley)
{
    static const unsigned char bit_mask[] = {0x80,0x40,0x20,0x10,0x08,0x04,0x02,0x01};
    unsigned char *image,*line_ptr,*line_ptr2,*buff_ptr;
    DISPLAY_BOX file;
    unsigned int x,y,width,height;
    size_t image_size;

    memory_info(&file);         /* need to provide this, gets file dimensions */
                                /* from memory image of file */
    *samplex = SAMPLE_200_X;    /* sample sizes */
    *sampley = SAMPLE_200_Y;
    *dx = file.width / *samplex;    /* extent of sample */
    *dy = file.height / *sampley;
    while (((long)*dx * (long)*dy) > MAX_MALLOC)
    {   /* adjust sampling to fit memory restrictions */
        (*samplex)++;
        (*sampley)++;
        *dx = file.width / *samplex;
        *dy = file.height / *sampley;
    }

    /* Compute the byte count once, in size_t: the product may not fit
     * in an int (the loop above uses long for the same reason), and a
     * signed overflow here would be undefined behavior. */
    image_size = (size_t)*dx * (size_t)*dy;
    if ((image = malloc(image_size)) == NULL)   /* allocate sample buffer */
        return NULL;
    memset(image,WHITE,image_size);             /* set to white */

    /* portion of the page actually covered by whole samples */
    width = (unsigned int)*dx * (unsigned int)*samplex;
    height = (unsigned int)*dy * (unsigned int)*sampley;

    if (*samplex >= 8)
    {   /* byte sampling: samples are at least a byte apart, so test
         * the whole byte containing the sample point */
        for (y = 0, buff_ptr = image ; y < height ; y += *sampley)
        {   /* for each y sample */
            /* need to provide memory_ptr which gets a pointer
             * to a scan line from the memory image of the file */
            line_ptr = memory_ptr(y);
            line_ptr2 = memory_ptr(y + *sampley/2); /* double sample in y */
            for (x = 0 ; x < width ; x += *samplex, buff_ptr++)
                if (*(line_ptr+(x>>3)) | *(line_ptr2+(x>>3)))
                    *buff_ptr = BLACK;  /* if byte has black, set sample */
        }
    }
    else
    {   /* bit sampling: test only the bit at the sample point */
        for (y = 0, buff_ptr = image ; y < height ; y += *sampley)
        {   /* for each y sample */
            /* need to provide memory_ptr which gets a pointer
             * to a scan line from the memory image of the file */
            line_ptr = memory_ptr(y);
            line_ptr2 = memory_ptr(y + *sampley/2); /* double sample in y */
            for (x = 0 ; x < width ; x += *samplex, buff_ptr++)
                if ((*(line_ptr+(x>>3)) | *(line_ptr2+(x>>3))) & bit_mask[x&7])
                    *buff_ptr = BLACK;  /* if bit is black, set sample */
        }
    }
    return image;
}
#if CUT_VERTICAL_LINES
/************************************************
* function: static void cut_vertical_lines(unsigned char *image,int dx,int dy)
* Remove vertical lines from sample. The purpose of this function is to
* unbox text. Removing the vertical box lines accomplishes this. Trying
* to remove the horizontal lines is dangerous because you might also remove
* the text.
* parameters: pointer to sampled image buffer and x/y extents of buffer
* returns: nothing
************************************************/
static void cut_vertical_lines(unsigned char *image,int dx,int dy)
{
int x,y,count,y1;
unsigned char *ptr,*qtr;
for (x = 0 ; x < dx ; x++) /* scan image left to right */
{
ptr = image+x;
count = 0;
for (y = 0 ; y < dy ; y++, ptr += dx)
{ /* scan up and down counting line pixels */
if (*ptr)
count++;
else
count = 0;
if (count >= VERTICAL_LINE_SIZE)
{ /* we have a veritcal line */
for (y1=y, qtr=ptr ; *ptr!=WHITE && y>=0 ; y--, ptr-=dx)
*ptr = WHITE; /* white out moving up */
for (y=y1+1, ptr=qtr+dx ;